""" Pyparsing parser for BibTeX files

A standalone parser using pyparsing.

pyparsing has a simple and expressive syntax so the grammar is easy to read and
write.

Submitted by Matthew Brett, 2010

Simplified BSD license
"""

from pyparsing import (
    Regex,
    Suppress,
    ZeroOrMore,
    Group,
    Optional,
    Forward,
    SkipTo,
    CaselessLiteral,
    Dict,
)


class Macro:
    """Placeholder for a reference to a macro, identified by its name.

    Two ``Macro`` objects compare equal when their names are equal.  Comparing
    a ``Macro`` with any other kind of object is simply "not equal" rather
    than an error.
    """

    def __init__(self, name):
        """Store `name`, the name of the macro being referenced."""
        self.name = name

    def __repr__(self):
        return f'Macro("{self.name}")'

    def __eq__(self, other):
        # Defer to Python's default handling for foreign types instead of
        # failing with AttributeError on objects that have no ``name``.
        if not isinstance(other, Macro):
            return NotImplemented
        return self.name == other.name

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same attribute that equality uses so equal macros hash alike.
        return hash(self.name)


# Single-character punctuation tokens.  Each one is wrapped in Suppress so it
# is matched but left out of the parse results.
LCURLY, RCURLY, LPAREN, RPAREN, QUOTE, COMMA, AT, EQUALS, HASH = (
    Suppress(char) for char in '{}()",@=#'
)


def bracketed(expr):
    """Build a matcher for `expr` enclosed in ``( )`` or in ``{ }``.

    The parenthesised form is tried first, then the curly-bracket form.  The
    delimiters themselves are suppressed from the results.
    """
    in_parens = LPAREN + expr + RPAREN
    in_curlies = LCURLY + expr + RCURLY
    return in_parens | in_curlies


# Parser components for string values (the hard bit).
# The plain-text matchers keep leading whitespace instead of skipping it.
chars_no_curly = Regex(r"[^{}]+").leave_whitespace()
chars_no_quotecurly = Regex(r'[^"{}]+').leave_whitespace()

# A curly string is a pair of curlies around any mix of plain text (no
# curlies) and nested curly strings.  It refers to itself, hence the Forward;
# each nested curly string is wrapped in its own Group.
curly_string = Forward()
curly_item = Group(curly_string) | chars_no_curly
curly_string <<= LCURLY + ZeroOrMore(curly_item) + RCURLY

# A quoted string is a pair of double quotes around any mix of plain text (no
# quotes or curlies) and nested curly strings.
quoted_item = Group(curly_string) | chars_no_quotecurly
quoted_string = QUOTE + ZeroOrMore(quoted_item) + QUOTE

# Numbers are plain runs of digits, i.e. non-negative integers only.
number = Regex(r"[0-9]+")

# Names are defined by exclusion: any character is allowed except whitespace
# and the punctuation below.  The list of excluded characters is from the
# btparse documentation.
_NAME_EXCLUDED = "\\s\"#%'(),={}"
any_name = Regex(f"[^{_NAME_EXCLUDED}]+")

# btparse says, and the test bibs show by experiment, that macro and field
# names cannot start with a digit.  Entry type names cannot start with a digit
# either (see tests/bibs), whereas cite keys can.  This variant therefore
# additionally excludes digits from the first character only.
not_digname = Regex(f"[^\\d{_NAME_EXCLUDED}][^{_NAME_EXCLUDED}]*")

# An explicit "@comment" (any letter case) comments out the rest of its line:
# the body must start with whitespace, "{" or "(" and then runs to the end of
# the line.  The body matcher does not skip leading whitespace.
_comment_body = Regex(r"[\s{(].*").leave_whitespace()
comment = AT + CaselessLiteral("comment") + _comment_body

# The name flavours, split by whether a leading digit is allowed.  Names built
# on not_dig_lower are converted to lower case as they are parsed.
not_dig_lower = not_digname.copy().set_parse_action(lambda toks: toks[0].lower())
macro_def = not_dig_lower.copy()
field_name = not_dig_lower.copy()
# A macro reference produces a Macro object (lower-cased name), not a string
macro_ref = not_dig_lower.copy().set_parse_action(
    lambda toks: Macro(toks[0].lower())
)

# Results names for the two fixed parts of an entry.
# NOTE(review): both names are themselves legal field names, so a field
# literally called "entry_type" or "cite_key" would presumably clash with
# them in the parsed entry -- confirm whether that matters to callers.
entry_type = not_dig_lower.set_results_name("entry_type")
cite_key = any_name.set_results_name("cite_key")

# Alternatives are tried in the order listed; number has to come before the
# macro name.
string = number | macro_ref | quoted_string | curly_string

# A field value is one string, optionally followed by further strings joined
# on with "#" (hash concatenation).
field_value = string + ZeroOrMore(HASH + string)

# One "name = value" pair, grouped so that the name and value stay together
field_def = Group(field_name + EQUALS + field_value)

# Fields are comma separated; the final field may omit its trailing comma
_comma_terminated_fields = ZeroOrMore(field_def + COMMA)
entry_contents = Dict(_comma_terminated_fields + Optional(field_def))

# An entry is "@type" followed by a cite key, a comma and the fields, all
# surrounded either by parentheses or by curlies
_entry_body = cite_key + COMMA + entry_contents
entry = AT + entry_type + bracketed(_entry_body)

# A preamble is a macro-like thing with no name: just a bracketed value
_preamble_keyword = CaselessLiteral("preamble")
preamble = AT + _preamble_keyword + bracketed(field_value)

# Macros (aka strings): "@string" around a bracketed "name = value"
macro_contents = macro_def + EQUALS + field_value
_macro_keyword = CaselessLiteral("string")
macro = AT + _macro_keyword + bracketed(macro_contents)

def _label_icomment(tokens):
    """Parse action: put the tag "icomment" in front of the skipped text."""
    tokens.insert(0, "icomment")


# Implicit comments: whatever text comes before the next "@"
icomment = SkipTo("@").set_parse_action(_label_icomment)

# Alternatives are tried in the order listed.  Entries come last (apart from
# the implicit-comment fallback) because their arbitrary start patterns would
# otherwise match comments, preambles or macros.
definitions = Group(comment | preamble | macro | entry | icomment)

# Start symbol: a file is any number of definitions
bibfile = ZeroOrMore(definitions)


def parse_str(str):
    """Parse the BibTeX text `str` with the `bibfile` grammar.

    Returns whatever ``bibfile.parse_string`` returns, called with its default
    options.  The parameter name shadows the builtin ``str``; it is kept
    unchanged so that existing keyword callers continue to work.
    """
    parsed = bibfile.parse_string(str)
    return parsed


if __name__ == "__main__":
    # Smoke test: parse a small sample and print a dump of each definition,
    # separated by blank lines
    txt = """
Some introductory text
(implicit comment)

@ARTICLE{Authors2011,
  author = {First Author and Second Author and Third Author},
  title = {An article about {S}omething},
  journal = "Journal of Articles",
  year = {2011},
  volume = {16},
  pages = {1140--1141},
  number = {2}
}
"""
    parsed = parse_str(txt)
    dumps = [definition.dump() for definition in parsed]
    print("\n\n".join(dumps))
